#!/bin/bash
#SBATCH --job-name=run_eval_master_tr_XXX
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --partition=cpu_p1
#SBATCH --qos=qos_cpu-dev
#SBATCH --cpus-per-task=1           # number of cores per tasks
#SBATCH --hint=nomultithread         # we get physical cores not logical
#SBATCH --time 00:30:00              # maximum execution time (HH:MM:SS)
#SBATCH --output=/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/evals/run_eval_master/logs/%x_%j.out
#SBATCH --account=cnw@cpu
#SBATCH --mail-type=FAIL,INVALID_DEPEND,REQUEUE,STAGE_OUT,TIME_LIMIT
#SBATCH --mail-user=hf-m4-jz@googlegroups.com
#SBATCH --no-requeue

# ---------------------------------------------------------------------------------------------------------

# Trace every command (-x) and abort on the first failing one (-e).
set -xe

# Load the shared M4 user environment for this cluster.
source "$cnw_ALL_CCFRWORK/start-m4-user"

# We are on an offline partition: force HF datasets/transformers to use
# only the local cache instead of reaching out to the Hub.
export HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1

# ---------------------------------------------------------------------------------------------------------

# ENV AND REPO

# Conda environment the evaluation pipeline runs in.
CONDA_ENV_NAME=m4-user  # Use your conda env
conda activate "$CONDA_ENV_NAME"

# Root of the m4 repository checkout; relative paths below assume it.
WORKING_DIR=$WORK/repos/m4  # Use your path on Work
pushd "$WORKING_DIR"
# export PYTHONPATH=$WORKING_DIR:$PYTHONPATH # Apparently done in run_evals_local_dataset.slurm

# ---------------------------------------------------------------------------------------------------------

# TRAINING

# Sweep the run belongs to. Leave to the empty string "" when no sweep.
SWEEP="tr_ZZZ"
# Training run whose checkpoints will be evaluated.
RUN_NAME="tr_XXX"

# ---------------------------------------------------------------------------------------------------------

# TASK VARIABLES

# Number of in-context examples per query, and how they are selected.
NUM_SHOTS="0"
SHOT_SELECTION_MODE="rices"
# Dataset split the evaluations run on.
DATASET_SPLIT="test"

# The dev tasks whose results we (re-)check for missing evaluations.
TASKS_TO_CHECK=(
    "VQAv2SampleVOPTOpenEndedVQAInContextAcc"
    "CocoSampleVOPTImageCaptioningInContextBleuCiderMeteorRouge"
    "IIIT5KSampleVOPTClassificationInContextAccWithKLAndEntropy"
    "SimpleImageNet1kSampleVOPTClassificationInContextAccWithKLAndEntropy"
)

# ---------------------------------------------------------------------------------------------------------

# MOST IMPORTANT VARIABLES TO TWEAK

SHOW_GPU_MEM_UTIL="False"  # Useful to debug and find the best configuration

TYPE_SHARDING="ZeRO-3"  # "ZeRO-3", "DDP"
TYPE_GPU="v100"  # "a100", "v100"
NUM_NODES="1"  # Doesn't currently work with number > 1
# Max is 8 GPUs per node for a100, 4 for v100.
# min((NUM_CPUS_PER_TASK * max_num_gpus_per_node) / max_num_cpus_per_node, NUM_GPUS_PER_NODE)
# GPUs are automatically allocated.
NUM_GPUS_PER_NODE="4"
MODEL_PRECISION="fp32"  # "fp32", "bf16", "fp16"
MINI_BATCH_SIZE="8"  # Heavily dependent on all of the above

# Walltime (hours) requested for each job of the evaluation array.
NUM_HOURS="06"

# ---------------------------------------------------------------------------------------------------------

# VARIABLES THAT SHOULDN'T NEED TO BE TOUCHED

TOKENIZER_USE_FAST="False"  # False for OPT-1.3B (True when https://github.com/huggingface/transformers/pull/20823 is merged), True for OPT-13B or other LM backbones

EVALUATION_VERSION="v2"

# Local copies of the evaluation datasets (the partition is offline).
EVALUATION_LOCAL_DATASETS="/gpfsscratch/rech/cnw/commun/local_datasets/"

# Where the training checkpoints of this run live.
CHECKPOINTS_DIR="/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/${SWEEP}/${RUN_NAME}"

# Every finished evaluation appends one JSON line to this file.
EVALUATION_JSONL_FILE="/gpfsscratch/rech/cnw/commun/experiments/local_experiment_dir/evals/results/${RUN_NAME}_evaluations.jsonl"

# The auto-generated accelerate/DeepSpeed configs are stored next to the
# checkpoints, one pair per master-job id.
SAVE_DIR_CONFIGS="${CHECKPOINTS_DIR}/evaluation_configs"
mkdir -p "$SAVE_DIR_CONFIGS"
ACCELERATE_CONFIG_FILE="${SAVE_DIR_CONFIGS}/${SLURM_JOB_ID}_accelerate_config.yaml.autogenerated"
DEEPSPEED_CONFIG_FILE="${SAVE_DIR_CONFIGS}/${SLURM_JOB_ID}_ds_config.json.autogenerated"

# ---------------------------------------------------------------------------------------------------------

# AUTOMATICALLY FILLED VARIABLES

# Total number of ranks across all nodes.
NUM_PROCESSES=$((NUM_GPUS_PER_NODE*NUM_NODES))

# CPU allocation per GPU is partition-specific on this cluster.
if [ "$TYPE_GPU" == "v100" ]; then
    NUM_CPUS_PER_GPU=10
elif [ "$TYPE_GPU" == "a100" ]; then
    NUM_CPUS_PER_GPU=8
else
    # exit 1 (not bare `exit`, which returned echo's status 0) so the
    # master job is reported as failed.
    echo "Unknown type of GPU. Exiting."
    exit 1
fi
NUM_CPUS_PER_TASK=$((NUM_CPUS_PER_GPU*NUM_GPUS_PER_NODE))

# Translate the sharding strategy into the accelerate config fragments.
if [ "$TYPE_SHARDING" == "ZeRO-3" ]; then
    # Multi-line YAML fragment injected under "deepspeed_config:" below.
    DEEPSPEED_CONFIG="
  deepspeed_multinode_launcher: standard
  deepspeed_config_file: $DEEPSPEED_CONFIG_FILE
  zero3_init_flag: true"
    DISTRIBUTED_TYPE="DEEPSPEED"
elif [ "$TYPE_SHARDING" == "DDP" ]; then
    DEEPSPEED_CONFIG="{}"
    DISTRIBUTED_TYPE="MULTI_GPU"
else
    echo "Unknown sharding type. Exiting."
    exit 1
fi

# Map the requested precision onto the DeepSpeed fp16/bf16 toggles.
if [ "$MODEL_PRECISION" == "fp32" ]; then
    ENABLE_FP16="false"
    ENABLE_BF16="false"
elif [ "$MODEL_PRECISION" == "fp16" ]; then
    ENABLE_FP16="true"
    ENABLE_BF16="false"
elif [ "$MODEL_PRECISION" == "bf16" ]; then
    ENABLE_FP16="false"
    ENABLE_BF16="true"
else
    # Previously fell through silently, leaving ENABLE_FP16/ENABLE_BF16
    # unset and generating an invalid DeepSpeed JSON config.
    echo "Unknown model precision '$MODEL_PRECISION'. Exiting."
    exit 1
fi

# Auto-generate the accelerate config for this job.
cat << EOT > "$ACCELERATE_CONFIG_FILE"
# WARNING: do not edit this file as this is an slurm-auto-generated file
compute_environment: LOCAL_MACHINE
deepspeed_config: $DEEPSPEED_CONFIG
distributed_type: $DISTRIBUTED_TYPE
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: null
main_training_function: main
num_machines: null
num_processes: null
use_cpu: false
EOT

# Auto-generate the DS config
cat << EOT > "$DEEPSPEED_CONFIG_FILE"
{
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "none"
        }
    },
    "fp16": {
        "enabled": $ENABLE_FP16
    },
    "bf16": {
        "enabled": $ENABLE_BF16
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}
EOT

# Sanity check: the sub-jobs reference this file, so abort (with a non-zero
# status — the bare `exit` used to report success) if it was not written.
if [[ ! -f "$ACCELERATE_CONFIG_FILE" ]] ; then
    echo "File $ACCELERATE_CONFIG_FILE is not there, aborting."
    exit 1
fi

# Map the GPU type to the matching SLURM constraint and billing account.
if [ "$TYPE_GPU" == "v100" ]; then
    CONSTRAINT="v100-32g"
    ACCOUNT="cnw@v100"
elif [ "$TYPE_GPU" == "a100" ]; then
    CONSTRAINT="a100"
    ACCOUNT="cnw@a100"
else
    # exit 1 so the failure is visible to SLURM (bare `exit` returned 0).
    echo "Unknown type of GPU. Exiting."
    exit 1
fi

# Multi-node evaluation is not supported yet.
if (("$NUM_NODES" > 1)); then
    echo "NUM_NODES > 1 is not yet supported in the evaluation pipeline. Exiting."
    exit 1
fi

# ---------------------------------------------------------------------------------------------------------

# MODEL GENERATION PARAMETERS

# Beam-search settings forwarded to generation in each evaluation job.
NUM_BEAMS="3"
NO_REPEAT_NGRAM_SIZE="0"  # 0 disables the repeated-n-gram penalty
MAX_NEW_TOKENS="15"

# ---------------------------------------------------------------------------------------------------------

# ESTABLISHING THE REMAINING EVALUATIONS TO BE DONE

# Every checkpoint exposes its weights under <step>/unwrapped_model.
# Use a glob instead of parsing `ls` output (fragile and flagged by
# shellcheck); guard explicitly against the no-match case, where the
# unexpanded pattern would be kept as a literal element.
DIR_MODEL_CHECKPOINTS=("$CHECKPOINTS_DIR"/opt_step*/unwrapped_model)
if [[ ! -e "${DIR_MODEL_CHECKPOINTS[0]}" ]]; then
    echo "No checkpoints found under $CHECKPOINTS_DIR. Exiting."
    exit 1
fi
TASKS_TO_DO=()
DIR_CHECKPOINTS=()

# Here we will retrieve all the evaluations that have not been done
for task in "${TASKS_TO_CHECK[@]}"
do
    for step in "${DIR_MODEL_CHECKPOINTS[@]}"
    do
        # An evaluation counts as done only when one line of the results file
        # matches the task, the checkpoint AND every generation parameter.
        found_eval="0"
        if test -f "$EVALUATION_JSONL_FILE"; then
            eval_check=$(grep "\"task\": \"$task\"" "$EVALUATION_JSONL_FILE" \
                | grep "$step" \
                | grep "\"num_shots\": ${NUM_SHOTS}" \
                | grep "\"shot_selection_mode\": \"${SHOT_SELECTION_MODE}\"" \
                | grep "\"num_beams\": ${NUM_BEAMS}" \
                | grep "\"no_repeat_ngram_size\": ${NO_REPEAT_NGRAM_SIZE}" \
                | grep "\"max_new_tokens\": ${MAX_NEW_TOKENS}" \
                || echo "")
            if [[ ! -z "$eval_check" ]]; then
                found_eval="1"
            fi
        fi

        if [ "$found_eval" -eq "0" ]
        then
            echo "Eval not done before: $task with $step, $NUM_SHOTS shots and $SHOT_SELECTION_MODE shot selection"
            TASKS_TO_DO+=("$task")
            # Strip the trailing "/unwrapped_model" to get the checkpoint root
            # (clearer and safer than the previous ${step::-16} offset hack).
            DIR_CHECKPOINTS+=("${step%/unwrapped_model}")
        else
            echo "Eval done before: $task with $step, $NUM_SHOTS shots and $SHOT_SELECTION_MODE shot selection"
        fi
    done
done

# --array=0-$LEN below is inclusive, so keep the index of the last job,
# not the number of jobs.
LEN=$(( ${#TASKS_TO_DO[@]} - 1 ))

echo "$LEN"
echo "${TASKS_TO_DO[@]}"
echo "${DIR_CHECKPOINTS[@]}"

# Nothing left to do: every (task, checkpoint) pair already has a result.
if (( LEN < 0 )); then
    echo "All checkpoints and tasks have been evaluated. Exiting."
    exit 0
fi

# Join all arguments after the first with the given single-character
# separator: joinByChar '|' a b c  ->  "a|b|c".
joinByChar() {
  local sep=${1:0:1}
  shift
  local joined=""
  local first=1
  local part
  for part in "$@"; do
    if (( first )); then
      joined=$part
      first=0
    else
      joined+=$sep$part
    fi
  done
  printf '%s\n' "$joined"
}

# Here we transform the array into a string where each element of the list is
# separated from the next and previous ones by |.
# This change is necessary because it seems that it is not possible to pass an
# array variable to a slurm job with the --export command, but it is possible
# to pass a string.
DIR_CHECKPOINTS_STRING=$(joinByChar '|' "${DIR_CHECKPOINTS[@]}")
TASKS_TO_DO_STRING=$(joinByChar '|' "${TASKS_TO_DO[@]}")

# ---------------------------------------------------------------------------------------------------------

# LAUNCHING THE EVALUATIONS

# NAME=VALUE pairs forwarded to every array job through sbatch --export.
# Each pair is quoted as a whole: the last four entries used to quote only
# the name ("NAME"=$VALUE), which would split the element in two if a value
# ever contained whitespace.
ENV_VARIABLES=(
"ALL"
"CONDA_ENV_NAME=$CONDA_ENV_NAME"
"WORKING_DIR=$WORKING_DIR"
"DIR_CHECKPOINTS_STRING=$DIR_CHECKPOINTS_STRING"
"TASKS_TO_DO_STRING=$TASKS_TO_DO_STRING"
"EVALUATION_LOCAL_DATASETS=$EVALUATION_LOCAL_DATASETS"
"EVALUATION_FILE=$EVALUATION_JSONL_FILE"
"NUM_SHOTS=$NUM_SHOTS"
"SHOT_SELECTION_MODE=$SHOT_SELECTION_MODE"
"NUM_BEAMS=$NUM_BEAMS"
"NO_REPEAT_NGRAM_SIZE=$NO_REPEAT_NGRAM_SIZE"
"MAX_NEW_TOKENS=$MAX_NEW_TOKENS"
"BATCH_SIZE=8"
"MINI_BATCH_SIZE=$MINI_BATCH_SIZE"
"ACCELERATE_CONFIG_FILE=$ACCELERATE_CONFIG_FILE"
"NUM_PROCESSES=$NUM_PROCESSES"
"TOKENIZER_USE_FAST=$TOKENIZER_USE_FAST"
"EVALUATION_VERSION=$EVALUATION_VERSION"
"MODEL_PRECISION=$MODEL_PRECISION"
"SHOW_GPU_MEM_UTIL=$SHOW_GPU_MEM_UTIL"
"DATASET_SPLIT=$DATASET_SPLIT"
)
EXPORTED_ENV_VARIABLES=$(joinByChar ',' "${ENV_VARIABLES[@]}")

# We launch a job array in which each job will launch an evaluation
# We launch a job array in which each job will launch an evaluation.
# Build the command as an argument ARRAY: the previous flat string was
# re-split on whitespace at expansion time ($($SBATCH_CMD)), which breaks
# as soon as any argument contains a space and defeats quoting entirely.
SBATCH_ARGS=(
    --nodes="${NUM_NODES}"
    --gres="gpu:${NUM_GPUS_PER_NODE}"
    --cpus-per-task="${NUM_CPUS_PER_TASK}"
    --time "${NUM_HOURS}:00:00"
    --array="0-$LEN"
    --job-name="auto_eval_${RUN_NAME}_${NUM_SHOTS}_shot_${SHOT_SELECTION_MODE}"
    --constraint="$CONSTRAINT"
    --account="$ACCOUNT"
    --export="$EXPORTED_ENV_VARIABLES"
    experiments/evaluation/vloom/common/run_evals_local_datasets.slurm
)
# Capture sbatch's "Submitted batch job NNN" line for the dependency below.
ID_JOB=$(sbatch "${SBATCH_ARGS[@]}")

# ---------------------------------------------------------------------------------------------------------

# SYNCHRONIZING THE RESULTS OF THE EVALUATIONS

# Recover the numeric job id of the evaluation array from sbatch's output.
echo "$ID_JOB"        # e.g. "Submitted batch job 773935"
ID_JOB=${ID_JOB##* }  # keep only the trailing token (the id)
echo "$ID_JOB"        # e.g. "773935"

# We schedule the synchronization with wandb once the evaluations are finished.
# afterany: the sync runs whether the array jobs succeeded or failed.
sbatch \
    --dependency=afterany:$ID_JOB \
    --job-name=gcs_sync_eval_${RUN_NAME}_${NUM_SHOTS}_shot_${SHOT_SELECTION_MODE} \
    --export=ALL,EVALUATION_JSONL_FILE=$EVALUATION_JSONL_FILE \
    experiments/evaluation/vloom/common/sync_evaluations_on_gcs.slurm

# ---------------------------------------------------------------------------------------------------------
